{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import glob\n",
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.model_selection import cross_validate\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from sklearn import svm\n",
    "from sklearn import neighbors\n",
    "from sklearn import naive_bayes\n",
    "from sklearn.svm import LinearSVC\n",
    "# from xgboost import XGBClassifier\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import TruncatedSVD \n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.gaussian_process import GaussianProcessClassifier\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from sklearn.externals import joblib\n",
    "\n",
    "%config InlineBackend.figure_format = 'svg'\n",
    "%matplotlib inline\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def yield_origin_csv():\n",
    "    flag = 1\n",
    "    id_, api_name_list, exinfos_list = [], [], []\n",
    "    api_name_regex = re.compile('<action api_name=\"(.*?)\" call_name')\n",
    "    exinfos_regex = re.compile('<exInfo value=\"(.*?)\"')\n",
    "    for path in glob.glob(\"./stage2_dataset/*\"):\n",
    "        with open(path, \"r\") as fp:\n",
    "            xml = fp.read()\n",
    "        api_names = re.findall(api_name_regex, xml)\n",
    "        exinfos = re.findall(exinfos_regex, xml)\n",
    "        api_name_list.append(\" \".join(api_names))\n",
    "        dll_exinfos = [ef.split(\"\\\\\")[-1].split('.')[0] for ef in exinfos \n",
    "                       if ef.endswith(\".dll\") and ef.startswith(\"C:\")]\n",
    "        id_.append(path.split(\".\")[1].split(\"/\")[-1]) \n",
    "        exinfos_list.append(\" \".join(dll_exinfos))\n",
    "        \n",
    "        \n",
    "        if flag % 300 == 0:\n",
    "            print(flag)\n",
    "        flag += 1\n",
    "    df = pd.DataFrame()\n",
    "    df[\"id\"] = id_\n",
    "    df[\"api_name\"] = api_name_list\n",
    "    df[\"exinfos\"] = exinfos_list\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data = yield_origin_csv()\n",
    "data = pd.read_csv(\"stage2_api_name_exinfos.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>api_name</th>\n",
       "      <th>exinfos</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b...</td>\n",
       "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...</td>\n",
       "      <td>user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083...</td>\n",
       "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...</td>\n",
       "      <td>mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743...</td>\n",
       "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...</td>\n",
       "      <td>user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>c97a29518ee63fecae29dd973941b8395bd3aaceb11c52...</td>\n",
       "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...</td>\n",
       "      <td>user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5...</td>\n",
       "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...</td>\n",
       "      <td>mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  id  \\\n",
       "0  3ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b...   \n",
       "1  2dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083...   \n",
       "2  fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743...   \n",
       "3  c97a29518ee63fecae29dd973941b8395bd3aaceb11c52...   \n",
       "4  fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5...   \n",
       "\n",
       "                                            api_name  \\\n",
       "0  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...   \n",
       "1  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...   \n",
       "2  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...   \n",
       "3  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...   \n",
       "4  AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...   \n",
       "\n",
       "                                             exinfos  \n",
       "0  user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt...  \n",
       "1  mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32...  \n",
       "2  user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ...  \n",
       "3  user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ...  \n",
       "4  mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.fillna(method=\"ffill\", inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 60000 entries, 0 to 59999\n",
      "Data columns (total 3 columns):\n",
      "id          60000 non-null object\n",
      "api_name    60000 non-null object\n",
      "exinfos     60000 non-null object\n",
      "dtypes: object(3)\n",
      "memory usage: 1.4+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "api_name_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)\n",
    "api_name_train_tfidf_features = api_name_vectorizer.fit_transform(data[\"api_name\"].tolist())\n",
    "\n",
    "exinfos_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)\n",
    "exinfos_train_tfidf_features = exinfos_vectorizer.fit_transform(data[\"exinfos\"].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"api_name_train_tfidf_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(api_name_train_tfidf_features, fp)\n",
    "with open(\"exinfos_train_tfidf_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(exinfos_train_tfidf_features, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "api_name_train_tfidf_features = pd.read_pickle(\"api_name_train_tfidf_features.pkl\")\n",
    "exinfos_train_tfidf_features = pd.read_pickle(\"exinfos_svded_features.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "svd = TruncatedSVD(n_components=1000, algorithm=\"arpack\", random_state=0)\n",
    "svded_train = svd.fit_transform(api_name_train_tfidf_features.tolil())\n",
    "svd = TruncatedSVD(n_components=10000, algorithm=\"arpack\", random_state=0)\n",
    "exinfos_svded_train = svd.fit_transform(exinfos_train_tfidf_features.tolil())\n",
    "with open(\"api_name_svded_10000_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(svded_train, fp)\n",
    "with open(\"exinfos_svded_10000_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(exinfos_svded_train, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "api_name_svded_train = pd.read_pickle(\"api_name_svded_features.pkl\")\n",
    "exinfos_svded_train = pd.read_pickle(\"exinfos_svded_features.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_data = np.hstack([api_name_svded_train, exinfos_svded_train])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60000, 2000)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans = KMeans(n_clusters=50, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = kmeans.fit_predict(merge_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.DataFrame()\n",
    "result[\"id\"] = data[\"id\"]\n",
    "result[\"family_id\"] = y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = pd.read_csv(\"result.csv\")[\"family_id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "exinfos = pd.read_pickle(\"exinfos_svded_features.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.manifold import TSNE\n",
    "\n",
    "X_tsne = TSNE(n_components=2, random_state=33).fit_transform(merge_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"api_name_exinfos_stne_data.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(X_tsne, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_tsne =  pd.read_pickle(\"call_name_tsne_data.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "font = {\"color\": \"darkred\",\n",
    "        \"size\": 13, \n",
    "        \"family\" : \"serif\"}\n",
    "\n",
    "plt.style.use(\"dark_background\")\n",
    "plt.figure()\n",
    "plt.scatter(X_tsne[:, 0], X_tsne[:, 1])\n",
    "plt.title(\"origin_data_t-SNE\", fontdict=font)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = pd.read_csv(\"34.78_k=100.csv\")[\"family_id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "font = {\"color\": \"darkred\",\n",
    "        \"size\": 13, \n",
    "        \"family\" : \"serif\"}\n",
    "\n",
    "plt.style.use(\"dark_background\")\n",
    "plt.figure()\n",
    "plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred.values, alpha=0.6, \n",
    "            cmap=plt.cm.get_cmap('rainbow', 100))\n",
    "plt.title(\"api_name_and_exinfos_t-SNE\", fontdict=font)\n",
    "cbar = plt.colorbar() \n",
    "cbar.set_label(label='family id', fontdict=font)\n",
    "plt.clim(-5, 100)\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"api_name_and_exinfos_TSNE.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "call_name_svded_features = pd.read_pickle(\"call_name_svded_features.pkl\")\n",
    "api_name_svded_features = pd.read_pickle(\"api_name_svded_features.pkl\")\n",
    "exinfos_svded_features = pd.read_pickle(\"exinfos_svded_features.pkl\")\n",
    "merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans = KMeans(n_clusters=100, random_state=0)\n",
    "y_pred = kmeans.fit_predict(merge_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "cluster = 100\n",
    "result = pd.DataFrame()\n",
    "result[\"id\"] = pd.read_csv(\"id.csv\", names=[\"id\"])[\"id\"]\n",
    "result[\"family_id\"] = y_pred\n",
    "\n",
    "result.to_csv(f\"k-means_cluster={cluster}_result.csv\", encoding=\"utf-8\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.6399999999999997"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "10.53 - 7.89"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.89"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2.63 * 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
